#-------------------------------------------------------------
# 4_logistic
# Leontine Alkema
# based on data and code examples from Bayesian Biostatistics (Lesaffre and Lawson)
#-------------------------------------------------------------
# settings
#setwd("C:/Users/lalkema/Documents/Bayes Umass/")
figdir = "fig/"
library(rjags)
## Loading required package: coda
## Linked to JAGS 4.3.0
## Loaded modules: basemod,bugs
library(R2jags)
##
## Attaching package: 'R2jags'
## The following object is masked from 'package:coda':
##
## traceplot
source("/Users/gcgibson/Downloads/functions.R")
#----
# data
#-----
# Description of the study (taken from LL)
# This is a randomized controlled clinical trial comparing two oral treatments for toenail toenail dermatophyte onychomycosis with either of two oral medications: Itraconazol 250 mg daily (treat=0) or Lamisil 250 mg daily (treat=1). The patients received treatment for twelve weeks and were evaluated at 0, 1, 2, 3, 6, 9 and 12 months. As response we have taken the the binarized degree of onycholysis of a subgroup of 294 patients.
# Reference: De Backer, M. and De Keyser, P. and De Vroey, C. and Lesaffre, E. A 12-week treatment for dermatophyte toe onycholysis: terbinafine 250 mg/day vs itraconazole 200 mg/day - a double-blind comparative trial, British Journal of Dermatology, 134, 16-17, 1996
#dat <- read.table(file = "data/toenailb.txt", header = T)
wells <- read.table ("/Users/gcgibson/Downloads/wells.dat")
dim(wells)
## [1] 3020 5
# these are the transformed versions of the variables that will be used
y.i <- wells$switch
n <- length(y.i)
dist100 <- wells$dist/100
# I just center the distance and arsenic variables from the start
c.dist100 <- dist100 - mean (dist100)
c.arsenic <- wells$arsenic - mean (wells$arsenic)
log.arsenic <- log (wells$arsenic)
c.log.arsenic <- log.arsenic - mean (log.arsenic)
# # not yet used
# educ4 <- educ/4
# c.educ4 <- educ4 - mean(educ4)
#-----
# fitting
model <- "
model {
for( i in 1:n) {
logit(p.i[i]) <- (
# different notation; using betas for main effects and b's for varying effects
b0
+ b1*c.dist100[i]
)
y.i[i] ~ dbern(p.i[i])
}
b0 ~ dnorm(0.0,1.0E-4)
b1 ~ dnorm(0.0,1.0E-4)
} # end model
"
# with random slopes
jags.data2 <- list(y.i = y.i,
c.dist100 = c.dist100, n=n)
parnames2 <- c( "b1", "b0")
# note: error messages may be less informative with jags parallel so go back to using "jags" (the non-parallel version)
# if you don't understand an error message
mod2 <- jags(data = jags.data2,
parameters.to.save=parnames2,
n.chains = 3, n.burnin = 1500, n.iter = 1500+10000, n.thin = 10, model.file = textConnection(model))
## module glm loaded
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 3020
## Unobserved stochastic nodes: 2
## Total graph size: 14859
##
## Initializing model
max(mod2$BUGSoutput$summary[, c("Rhat")])
## [1] 1.002216
min(mod2$BUGSoutput$summary[, c("n.eff")])
## [1] 2800
which.max(mod2$BUGSoutput$summary[, c("Rhat")])
## deviance
## 3
which.min(mod2$BUGSoutput$summary[, c("n.eff")])
## deviance
## 3
# just some trace plots
PlotTrace("b0", mod2$BUGSoutput$sims.array)
PlotTrace("b1", mod2$BUGSoutput$sims.array)
# some priors and posteriors to check that priors weren't informative
mcmc.array <- mod2$BUGSoutput$sims.array
par(lwd = 3, cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5,mar = c(5,5,1,1), mfrow = c(1,3))
for (k in 0:1){
hist(c(mcmc.array[,,paste0("b", k, "")]), freq = F, main = "", xlab = paste0("Coeff ", k))
curve(dnorm(x,0,sqrt(1/10^(-4))), add = T, col = 2)
}
#-----
# fitting
model <- "
model {
for( i in 1:n) {
logit(p.i[i]) <- (
# different notation; using betas for main effects and b's for varying effects
a0 + a1*c.dist100[i] + a2*c.arsenic[i]
)
y.i[i] ~ dbern(p.i[i])
}
a0 ~ dnorm(0.0,1.0E-4)
a1 ~ dnorm(0.0,1.0E-4)
a2 ~ dnorm(0.0,1.0E-4)
} # end model
"
# with random slopes
jags.data2 <- list(y.i = y.i,
c.dist100 = c.dist100, c.arsenic=c.arsenic, n=n)
parnames2 <- c( "a1", "a0","a2")
# note: error messages may be less informative with jags parallel so go back to using "jags" (the non-parallel version)
# if you don't understand an error message
mod2 <- jags(data = jags.data2,
parameters.to.save=parnames2,
n.chains = 3, n.burnin = 1500, n.iter = 1500+10000, n.thin = 10, model.file = textConnection(model))
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 3020
## Unobserved stochastic nodes: 3
## Total graph size: 18466
##
## Initializing model
max(mod2$BUGSoutput$summary[, c("Rhat")])
## [1] 1.002251
min(mod2$BUGSoutput$summary[, c("n.eff")])
## [1] 1100
which.max(mod2$BUGSoutput$summary[, c("Rhat")])
## a1
## 2
which.min(mod2$BUGSoutput$summary[, c("n.eff")])
## a1
## 2
# just some trace plots
PlotTrace("a0", mod2$BUGSoutput$sims.array)
PlotTrace("a1", mod2$BUGSoutput$sims.array)
PlotTrace("a2", mod2$BUGSoutput$sims.array)
# some priors and posteriors to check that priors weren't informative
mcmc.array <- mod2$BUGSoutput$sims.array
par(lwd = 3, cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5,mar = c(5,5,1,1), mfrow = c(1,3))
for (k in 0:2){
hist(c(mcmc.array[,,paste0("a", k, "")]), freq = F, main = "", xlab = paste0("Coeff ", k))
curve(dnorm(x,0,sqrt(1/10^(-4))), add = T, col = 2)
}
a0_mean <- mean(mcmc.array[,,"a0"])
a1_mean <- mean(mcmc.array[,,"a1"])
a2_mean <- mean(mcmc.array[,,"a2"])
#-----
# fitting
model <- "
model {
for( i in 1:n) {
logit(p.i[i]) <- (
# different notation; using betas for main effects and b's for varying effects
b0 + b1*c.dist100[i] + b2*c.arsenic[i] + b3*c.arsenic[i]*c.dist100[i]
)
y.i[i] ~ dbern(p.i[i])
}
b0 ~ dnorm(0.0,1.0E-4)
b1 ~ dnorm(0.0,1.0E-4)
b2 ~ dnorm(0.0,1.0E-4)
b3 ~ dnorm(0.0,1.0E-4)
} # end model
"
# with random slopes
jags.data2 <- list(y.i = y.i,
c.dist100 = c.dist100, c.arsenic=c.arsenic, n=n)
parnames2 <- c( "b1", "b0","b2","b3")
# note: error messages may be less informative with jags parallel so go back to using "jags" (the non-parallel version)
# if you don't understand an error message
mod2 <- jags(data = jags.data2,
parameters.to.save=parnames2,
n.chains = 3, n.burnin = 1500, n.iter = 1500+10000, n.thin = 10, model.file = textConnection(model))
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 3020
## Unobserved stochastic nodes: 4
## Total graph size: 21486
##
## Initializing model
max(mod2$BUGSoutput$summary[, c("Rhat")])
## [1] 1.001509
min(mod2$BUGSoutput$summary[, c("n.eff")])
## [1] 2000
which.max(mod2$BUGSoutput$summary[, c("Rhat")])
## b1
## 2
which.min(mod2$BUGSoutput$summary[, c("n.eff")])
## b1
## 2
# just some trace plots
PlotTrace("b0", mod2$BUGSoutput$sims.array)
PlotTrace("b1", mod2$BUGSoutput$sims.array)
PlotTrace("b2", mod2$BUGSoutput$sims.array)
PlotTrace("b3", mod2$BUGSoutput$sims.array)
# some priors and posteriors to check that priors weren't informative
mcmc.array <- mod2$BUGSoutput$sims.array
par(lwd = 3, cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5,mar = c(5,5,1,1), mfrow = c(1,4))
for (k in 0:3){
hist(c(mcmc.array[,,paste0("b", k, "")]), freq = F, main = "", xlab = paste0("Coeff ", k))
curve(dnorm(x,0,sqrt(1/10^(-4))), add = T, col = 2)
}
b0_mean <- mean(mcmc.array[,,"b0"])
b1_mean <- mean(mcmc.array[,,"b1"])
b2_mean <- mean(mcmc.array[,,"b2"])
b3_mean <- mean(mcmc.array[,,"b3"])
invlogit <- function(x) {exp(x)/(1 + exp(x))}
plot(x = c.arsenic, y =y.i ,ylab="P[switch]", xlab = "Arsenic Level", main = "Probability of Switching versus Arsenic Level")
curve (invlogit(a0_mean + a1_mean*(1-mean(dist100)) + a2_mean*x), add=TRUE)
curve (invlogit(b0_mean + b1_mean*(1-mean(dist100)) + b2_mean*x + b3_mean*(1-mean(dist100))*x), add=TRUE, col = "red", lty = 2)
legend("bottomright", legend = c("Model 2", "Model 3"), col = c(1,2), lty = c(1,2), cex = .5)
set.seed(12345)
# assign households to villages
J <- 300
getj1.i <- c(seq(1,J), sample(size = n-J, x = seq(1,J), replace = TRUE))
getj2.i <- sort(getj1.i) # now the households are assumed
#to be sorted by village
#-----
# fitting
model <- "
model {
for( i in 1:n) {
logit(p.i[i]) <- (
# different notation; using betas for main effects and b's for varying effects
b0 + b1*c.dist100[i] + b2*c.log.arsenic[i] + b3*c.log.arsenic[i]*c.dist100[i] + alpha.j[getj1.i[i]]
)
y.i[i] ~ dbern(p.i[i])
}
for (j in 1:J){
alpha.j[j] ~ dnorm(0, 1/sigma_alpha^2)
}
b0 ~ dnorm(0.0,1.0E-4)
b1 ~ dnorm(0.0,1.0E-4)
b2 ~ dnorm(0.0,1.0E-4)
b3 ~ dnorm(0.0,1.0E-4)
sigma_alpha ~ dunif(0,200)
} # end model
"
# with random slopes
jags.data2 <- list(y.i = y.i,
c.dist100 = c.dist100, c.log.arsenic=c.log.arsenic, n=n,getj1.i = getj1.i, J = J)
parnames2 <- c( "b1", "b0","b2", "b3","alpha.j","sigma_alpha")
# note: error messages may be less informative with jags parallel so go back to using "jags" (the non-parallel version)
# if you don't understand an error message
mod2 <- jags(data = jags.data2,
parameters.to.save=parnames2,
n.chains = 3, n.burnin = 1500, n.iter = 1500+10000, n.thin = 10, model.file = textConnection(model))
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 3020
## Unobserved stochastic nodes: 305
## Total graph size: 24815
##
## Initializing model
max(mod2$BUGSoutput$summary[, c("Rhat")])
## [1] 1.312799
min(mod2$BUGSoutput$summary[, c("n.eff")])
## [1] 15
which.max(mod2$BUGSoutput$summary[, c("Rhat")])
## sigma_alpha
## 306
which.min(mod2$BUGSoutput$summary[, c("n.eff")])
## sigma_alpha
## 306
# just some trace plots
PlotTrace("b0", mod2$BUGSoutput$sims.array)
PlotTrace("b1", mod2$BUGSoutput$sims.array)
PlotTrace("b2", mod2$BUGSoutput$sims.array)
PlotTrace("b3", mod2$BUGSoutput$sims.array)
# some priors and posteriors to check that priors weren't informative
mcmc.array <- mod2$BUGSoutput$sims.array
par(lwd = 3, cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5,mar = c(5,5,1,1), mfrow = c(1,4))
for (k in 0:3){
hist(c(mcmc.array[,,paste0("b", k, "")]), freq = F, main = "", xlab = paste0("Coeff ", k))
curve(dnorm(x,0,sqrt(1/10^(-4))), add = T, col = 2)
}
b0_mean <- mean(mcmc.array[,,"b0"])
b1_mean <- mean(mcmc.array[,,"b1"])
b2_mean <- mean(mcmc.array[,,"b2"])
b3_mean <- mean(mcmc.array[,,"b3"])
sigma_alpha_mean_1 <- mean(mcmc.array[,,"sigma_alpha"])
set.seed(12345)
# assign households to villages
J <- 300
getj1.i <- c(seq(1,J), sample(size = n-J, x = seq(1,J), replace = TRUE))
getj2.i <- sort(getj1.i) # now the households are assumed
#to be sorted by village
#-----
# fitting
model <- "
model {
for( i in 1:n) {
logit(p.i[i]) <- (
# different notation; using betas for main effects and b's for varying effects
b0 + b1*c.dist100[i] + b2*c.log.arsenic[i] + alpha.j[getj1.i[i]]
)
y.i[i] ~ dbern(p.i[i])
}
for (j in 1:J){
alpha.j[j] ~ dnorm(0, 1/sigma_alpha^2)
}
b0 ~ dnorm(0.0,1.0E-4)
b1 ~ dnorm(0.0,1.0E-4)
b2 ~ dnorm(0.0,1.0E-4)
sigma_alpha ~ dunif(0,200)
} # end model
"
# with random slopes
jags.data2 <- list(y.i = y.i,
c.dist100 = c.dist100, c.log.arsenic=c.log.arsenic, n=n,getj1.i = getj2.i, J = J)
parnames2 <- c( "b1", "b0","b2", "b3","alpha.j","sigma_alpha")
# note: error messages may be less informative with jags parallel so go back to using "jags" (the non-parallel version)
# if you don't understand an error message
mod2 <- jags(data = jags.data2,
parameters.to.save=parnames2,
n.chains = 3, n.burnin = 1500, n.iter = 1500+10000, n.thin = 10, model.file = textConnection(model))
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 3020
## Unobserved stochastic nodes: 304
## Total graph size: 21795
##
## Initializing model
## Warning in jags.samples(model, variable.names, n.iter, thin, type = "trace", : Failed to set trace monitor for b3
## Variable b3 not found
max(mod2$BUGSoutput$summary[, c("Rhat")])
## [1] 1.005653
min(mod2$BUGSoutput$summary[, c("n.eff")])
## [1] 400
which.max(mod2$BUGSoutput$summary[, c("Rhat")])
## alpha.j[101]
## 101
which.min(mod2$BUGSoutput$summary[, c("n.eff")])
## alpha.j[101]
## 101
# just some trace plots
PlotTrace("b0", mod2$BUGSoutput$sims.array)
PlotTrace("b1", mod2$BUGSoutput$sims.array)
PlotTrace("b2", mod2$BUGSoutput$sims.array)
# some priors and posteriors to check that priors weren't informative
mcmc.array <- mod2$BUGSoutput$sims.array
par(lwd = 3, cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5,mar = c(5,5,1,1), mfrow = c(1,4))
for (k in 0:2){
hist(c(mcmc.array[,,paste0("b", k, "")]), freq = F, main = "", xlab = paste0("Coeff ", k))
curve(dnorm(x,0,sqrt(1/10^(-4))), add = T, col = 2)
}
b0_mean <- mean(mcmc.array[,,"b0"])
b1_mean <- mean(mcmc.array[,,"b1"])
b2_mean <- mean(mcmc.array[,,"b2"])
sigma_alpha_mean_2 <- mean(mcmc.array[,,"sigma_alpha"])
We can see that the variance of the first group mean is smaller than that of the second group mean
print ("Grouping 1 sigma alpha")
## [1] "Grouping 1 sigma alpha"
print (sigma_alpha_mean_1)
## [1] 0.1334997
print ("Grouping 2 sigma alpha")
## [1] "Grouping 2 sigma alpha"
print (sigma_alpha_mean_2)
## [1] 1.049601
This make sense because we have reduced the variance within each group by putting households that are close to each other in the dataset , under the same village cluster. As long as there is any correlation between row i and row i+1 (such as survey collectors going from one town, to the town adjacent) this will reduce the variance within the cluster.
linpred <-b0_mean + (.5-mean(dist100))*b1_mean +(log(1.2)-mean(log.arsenic))*b2_mean
invlogit <- function(x) {exp(x)/(1 + exp(x))}
print ("J = 1")
## [1] "J = 1"
quantile(invlogit(linpred+mcmc.array[,,"alpha.j[1]"]),c(.025,.975))
## 2.5% 97.5%
## 0.3302953 0.9038543
print ("J = 2")
## [1] "J = 2"
quantile(invlogit(linpred+mcmc.array[,,"alpha.j[2]"]),c(.025,.975))
## 2.5% 97.5%
## 0.4954345 0.9539468
print ("J = 3")
## [1] "J = 3"
quantile(invlogit(linpred+mcmc.array[,,"alpha.j[3]"]),c(.025,.975))
## 2.5% 97.5%
## 0.6456555 0.9651341
print ("J = 4")
## [1] "J = 4"
quantile(invlogit(linpred+mcmc.array[,,"alpha.j[4]"]),c(.025,.975))
## 2.5% 97.5%
## 0.5268797 0.9274944
print ("J = 5")
## [1] "J = 5"
quantile(invlogit(linpred+mcmc.array[,,"alpha.j[5]"]),c(.025,.975))
## 2.5% 97.5%
## 0.4599345 0.8672441